In [1]:
!pip install lime
Requirement already satisfied: lime in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (0.2.0.1) Requirement already satisfied: matplotlib in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (3.9.2) Requirement already satisfied: numpy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (2.0.2) Requirement already satisfied: scipy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (1.14.1) Requirement already satisfied: tqdm in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (4.67.1) Requirement already satisfied: scikit-learn>=0.18 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (1.5.2) Requirement already satisfied: scikit-image>=0.12 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (0.24.0) Requirement already satisfied: networkx>=2.8 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (3.4.2) Requirement already satisfied: pillow>=9.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (10.4.0) Requirement already satisfied: imageio>=2.33 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (2.36.0) Requirement already satisfied: tifffile>=2022.8.12 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (2024.9.20) Requirement already satisfied: packaging>=21 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (24.1) Requirement already satisfied: lazy-loader>=0.4 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (0.4) Requirement already satisfied: joblib>=1.2.0 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn>=0.18->lime) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn>=0.18->lime) (3.5.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (1.3.0) Requirement already satisfied: cycler>=0.10 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (4.53.1) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (1.4.5) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (3.1.4) Requirement already satisfied: python-dateutil>=2.7 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (2.9.0.post0) Requirement already satisfied: colorama in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from tqdm->lime) (0.4.6) Requirement already satisfied: six>=1.5 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
[notice] A new release of pip is available: 23.2.1 -> 24.3.1 [notice] To update, run: python.exe -m pip install --upgrade pip
In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Import LIME
import lime.lime_tabular
# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")
# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')
# Features and target variable
X = data.iloc[:, :-1] # All columns except the last one
y = data['output'] # Target variable
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# List of regression models up to Decision Tree
models = [
('Linear Regression', LinearRegression()),
('Ridge Regression', Ridge()),
('Lasso Regression', Lasso()),
('KNN', KNeighborsRegressor()),
('Decision Tree', DecisionTreeRegressor())
]
# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
# Cross-validation with 10 folds
cv_scores_rmse = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
cv_scores_r2 = cross_val_score(model, X, y, cv=10, scoring='r2')
# Compute mean and standard deviation of CV scores
rmse_mean = -cv_scores_rmse.mean() # Convert negative RMSE to positive
rmse_std = cv_scores_rmse.std()
r2_mean = cv_scores_r2.mean()
r2_std = cv_scores_r2.std()
return rmse_mean, rmse_std, r2_mean, r2_std
# Hyperparameter tuning using GridSearchCV for the selected models
param_grids = {
'Linear Regression': {}, # No hyperparameters for linear regression
'Ridge Regression': {'regressor__alpha': [0.1, 1, 10, 100]}, # Note the 'regressor__' prefix
'Lasso Regression': {'regressor__alpha': [0.1, 1, 10]}, # Note the 'regressor__' prefix
'KNN': {'regressor__n_neighbors': [3, 5, 10, 15], 'regressor__weights': ['uniform', 'distance']}, # Note the 'regressor__' prefix
'Decision Tree': {'regressor__max_depth': [None, 5, 10, 20], 'regressor__min_samples_split': [2, 5, 10]} # Note the 'regressor__' prefix
}
# Perform hyperparameter tuning and evaluation for each model
for name, model in models:
print(f"\nTraining and hyperparameter tuning for {name}...")
# Build a pipeline for each model
pipeline = Pipeline([
('scaler', StandardScaler()), # Standardize the features
('pca', PCA(n_components=0.95)), # Apply PCA for dimensionality reduction
('regressor', model) # Model
])
# Hyperparameter tuning with GridSearchCV
param_grid = param_grids.get(name, {})
# Skip models with no hyperparameters to tune
if param_grid:
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best {name} model: {grid_search.best_params_}")
else:
pipeline.fit(X_train, y_train)
best_model = pipeline # Use the pipeline as the best model
# Cross-validation after tuning
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)
print(f"Cross-validation after tuning for {name}:")
print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}")
# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
# Apply clipping strategy to keep predictions within the 0-5 range
y_pred_clipped = np.clip(y_pred, 0, 5)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_clipped))
test_r2 = r2_score(y_test, y_pred_clipped)
print(f"\nTest RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
# LIME Explanation
print(f"\nLIME Explanation for {name}:")
# Convert the training data to numpy arrays for LIME
X_train_np = X_train.to_numpy() if isinstance(X_train, pd.DataFrame) else X_train
feature_names = X.columns.tolist()
# Create a LIME explainer instance
explainer = lime.lime_tabular.LimeTabularExplainer(
X_train_np,
training_labels=y_train.to_numpy(),
mode='regression',
feature_names=feature_names,
verbose=True
)
# Explain the first instance in the test set
lime_exp = explainer.explain_instance(X_test.iloc[0].to_numpy(), best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for Linear Regression... Cross-validation after tuning for Linear Regression: CV Mean RMSE (after tuning): 0.29643360817884284, CV RMSE Std: 0.05338130585877468 CV Mean R² (after tuning): 0.8757317869351426, CV R² Std: 0.020583447413062995 Test RMSE: 0.5289088981538131 Test R²: 0.8901144203318311 LIME Explanation for Linear Regression:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
Intercept 2.694298070314206 Prediction_local [3.09354252] Right: 4.837937298579014
Training and hyperparameter tuning for Ridge Regression...
Best Ridge Regression model: {'regressor__alpha': 100}
Cross-validation after tuning for Ridge Regression:
CV Mean RMSE (after tuning): 0.29233334123603, CV RMSE Std: 0.053173754478938955
CV Mean R² (after tuning): 0.877478808221376, CV R² Std: 0.020353105439757827
Test RMSE: 0.5277177423867241
Test R²: 0.8906088096406924
LIME Explanation for Ridge Regression:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
Intercept 2.6400669587558934 Prediction_local [3.19352113] Right: 4.848054997023594
Training and hyperparameter tuning for Lasso Regression...
Best Lasso Regression model: {'regressor__alpha': 0.1}
Cross-validation after tuning for Lasso Regression:
CV Mean RMSE (after tuning): 0.47471008407376114, CV RMSE Std: 0.07811254918443654
CV Mean R² (after tuning): 0.8011821415065745, CV R² Std: 0.028978128431378893
Test RMSE: 0.6908028328724246
Test R²: 0.812549252632241
LIME Explanation for Lasso Regression:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
Intercept 2.730877464178246 Prediction_local [2.93101529] Right: 5.075368161590885
Training and hyperparameter tuning for KNN...
Best KNN model: {'regressor__n_neighbors': 3, 'regressor__weights': 'distance'}
Cross-validation after tuning for KNN:
CV Mean RMSE (after tuning): 0.41175991957459557, CV RMSE Std: 0.09491977354924565
CV Mean R² (after tuning): 0.8263382304760768, CV R² Std: 0.04224531935297672
Test RMSE: 0.6675267403320166
Test R²: 0.8249684688411176
LIME Explanation for KNN:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
Intercept 3.290188633943761 Prediction_local [3.74078172] Right: 5.000000000000001
Training and hyperparameter tuning for Decision Tree...
Best Decision Tree model: {'regressor__max_depth': 20, 'regressor__min_samples_split': 5}
Cross-validation after tuning for Decision Tree:
CV Mean RMSE (after tuning): 0.858903676269741, CV RMSE Std: 0.19255571892232373
CV Mean R² (after tuning): 0.6031250925037617, CV R² Std: 0.10214507406332363
Test RMSE: 0.8003659427520516
Test R²: 0.7483736273732797
LIME Explanation for Decision Tree:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names warnings.warn(
Intercept 4.096315735187511 Prediction_local [3.93953366] Right: 5.0
In [6]:
!pip install xgboost
Requirement already satisfied: xgboost in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (2.1.3) Requirement already satisfied: numpy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (2.0.2) Requirement already satisfied: scipy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.14.1)
[notice] A new release of pip is available: 23.2.1 -> 24.3.1 [notice] To update, run: python.exe -m pip install --upgrade pip
In [7]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular
# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")
# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')
# Features and target variable
X = data.iloc[:, :-1] # All columns except the last one
y = data['output'] # Target variable
# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99) # Preserve 95% of variance
X_pca = pca.fit_transform(X_scaled)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)
# XGBoost model
model = XGBRegressor(random_state=42, n_jobs=-1)
# Hyperparameter tuning grid
param_grid = {
'n_estimators': [100, 200],
'max_depth': [3, 6],
'learning_rate': [0.01, 0.1],
'subsample': [0.8, 1.0]
}
# Hyperparameter tuning
print(f"Training and hyperparameter tuning for XGBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best XGBoost model: {grid_search.best_params_}")
# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)
# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)
# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)
print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")
# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()
original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)
print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")
# LIME Explanation
print(f"\nLIME Explanation for XGBoost:")
explainer = lime.lime_tabular.LimeTabularExplainer(
X_train,
training_labels=y_train,
mode='regression',
verbose=True,
feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])],
feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for XGBoost...
Best XGBoost model: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Test RMSE (scaled): 0.5386805361452687
Test R² (scaled): 0.8860166218904774
Test RMSE (original): 0.5386805361452687
Test R² (original): 0.8860166218904774
LIME Explanation for XGBoost:
Intercept 2.041903063041338
Prediction_local [4.48649382]
Right: 5.025804
In [8]:
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular
# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")
# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')
# Features and target variable
X = data.iloc[:, :-1] # All columns except the last one
y = data['output'] # Target variable
# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99) # Preserve 99% of variance
X_pca = pca.fit_transform(X_scaled)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)
# AdaBoost model
model = AdaBoostRegressor(random_state=42)
# Hyperparameter tuning grid
param_grid = {
'n_estimators': [50, 100], # Number of boosting rounds
'learning_rate': [0.01, 0.1], # Learning rate
'loss': ['linear', 'square'] # Loss function options
}
# Hyperparameter tuning
print(f"Training and hyperparameter tuning for AdaBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best AdaBoost model: {grid_search.best_params_}")
# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)
# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)
# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)
print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")
# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()
original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)
print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")
# LIME Explanation for AdaBoost
print(f"\nLIME Explanation for AdaBoost:")
explainer = lime.lime_tabular.LimeTabularExplainer(
X_train,
training_labels=y_train,
mode='regression',
verbose=True,
feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])],
feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for AdaBoost...
Best AdaBoost model: {'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 100}
Test RMSE (scaled): 1.0199944928960172
Test R² (scaled): 0.5913282566763101
Test RMSE (original): 1.0199944928960172
Test R² (original): 0.5913282566763101
LIME Explanation for AdaBoost:
Intercept 2.3758016834849527
Prediction_local [4.07971416]
Right: 3.8797909407665507
In [9]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular
# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")
# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')
# Features and target variable
X = data.iloc[:, :-1] # All columns except the last one
y = data['output'] # Target variable
# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99) # Preserve 99% of variance
X_pca = pca.fit_transform(X_scaled)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)
# Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)
# Hyperparameter tuning grid
param_grid = {
'n_estimators': [50, 100], # Number of boosting rounds
'learning_rate': [0.01, 0.1], # Learning rate
'max_depth': [3, 5], # Maximum depth of trees
'subsample': [0.8, 1.0] # Fraction of samples for each tree
}
# Hyperparameter tuning
print(f"Training and hyperparameter tuning for Gradient Boosting...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Gradient Boosting model: {grid_search.best_params_}")
# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)
# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)
# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)
print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")
# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()
original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)
print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")
# LIME Explanation for Gradient Boosting
print(f"\nLIME Explanation for Gradient Boosting:")
explainer = lime.lime_tabular.LimeTabularExplainer(
X_train,
training_labels=y_train,
mode='regression',
verbose=True,
feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])],
feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for Gradient Boosting...
Best Gradient Boosting model: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Test RMSE (scaled): 0.5817735263264968
Test R² (scaled): 0.8670504517339344
Test RMSE (original): 0.5817735263264968
Test R² (original): 0.8670504517339344
LIME Explanation for Gradient Boosting:
Intercept 2.1464994848153753
Prediction_local [4.45243761]
Right: 5.001019534425308
In [ ]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular
# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")
# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')
# Features and target variable
X = data.iloc[:, :-1] # All columns except the last one
y = data['output'] # Target variable
# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99) # Preserve 99% of variance
X_pca = pca.fit_transform(X_scaled)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)
# Random Forest model
model = RandomForestRegressor(random_state=42)
# Hyperparameter tuning grid
param_grid = {
'n_estimators': [50, 100], # Number of trees in the forest
'max_depth': [3, 5, 10], # Maximum depth of trees
'min_samples_split': [2, 5], # Minimum number of samples required to split an internal node
'min_samples_leaf': [1, 2], # Minimum number of samples required to be at a leaf node
'bootstrap': [True, False] # Whether bootstrap samples are used when building trees
}
# Hyperparameter tuning
print(f"Training and hyperparameter tuning for Random Forest...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Random Forest model: {grid_search.best_params_}")
# Cross-validation performance on the training set for RMSE and R²
train_cv_r2_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
train_cv_rmse_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
# Mean and Standard Deviation for R² and RMSE
train_cv_mean_r2 = train_cv_r2_scores.mean()
train_cv_std_r2 = train_cv_r2_scores.std()
train_cv_mean_rmse = np.sqrt(-train_cv_rmse_scores.mean()) # RMSE is negative, so negate to make it positive
train_cv_std_rmse = np.std(np.sqrt(-train_cv_rmse_scores))
# Output the cross-validation metrics on the training set
print(f"\nTraining CV Mean R²: {train_cv_mean_r2}")
print(f"Training CV Std R²: {train_cv_std_r2}")
print(f"Training CV Mean RMSE: {train_cv_mean_rmse}")
print(f"Training CV Std RMSE: {train_cv_std_rmse}")
# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)
# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)
# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)
print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")
# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()
original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)
print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")
# LIME Explanation for Random Forest
print(f"\nLIME Explanation for Random Forest:")
explainer = lime.lime_tabular.LimeTabularExplainer(
X_train,
training_labels=y_train,
mode='regression',
verbose=True,
feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])],
feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for Random Forest...
In [ ]: